# **MODIFY THIS CHUNK**
library(here)
kundaje_dir <- trimws(readr::read_lines("../../AK_PROJ_DIR.txt"))
doc_id <- "01"
out <- here("output/03-chrombpnet/03-syntax/", doc_id); dir.create(out, recursive = TRUE)
figout <- here("figures/03-chrombpnet/03-syntax", doc_id, "/"); dir.create(figout, recursive = TRUE)In this notebook, we do global investigations of the de novo motifs:
library(glue)
library(scales)
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(stringr)
library(ggrepel)
library(cowplot)
library(plotly)
library(ggseqlogo)
script_path <- here("code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "chrombpnet_utils.R"))
ggplot2::theme_set(theme_BOR())Cluster metadata:
cluster_meta <- read_csv(here("output/05-misc/03/TableS2_cluster_meta_qc.csv"))## Rows: 203 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Cluster, organ, organ_code, compartment, L1_annot, L2_annot, L3_an...
## dbl (8): cluster_id, dend_order, ncell, median_numi, median_ngene, median_n...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chrombpnet_models_keep <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_models_keep2.tsv"),
col_names = c("Cluster", "Folds_keep", "Cluster_ID"))## Rows: 189 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Cluster, Folds_keep, Cluster_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cluster_order <- cluster_meta %>% arrange(organ, L1_annot) %>% pull(Cluster_ChromBPNet)
organs <- unique(cluster_meta$organ) %>% sort()
organs[organs == "StomachEsophagus"] <- "Stomach"
head(chrombpnet_models_keep)## # A tibble: 6 × 3
## Cluster Folds_keep Cluster_ID
## <chr> <chr> <chr>
## 1 Adrenal_c0 fold_0,fold_1,fold_2,fold_3,fold_4 AG_0
## 2 Adrenal_c1 fold_0,fold_1,fold_2,fold_3,fold_4 AG_1
## 3 Adrenal_c2 fold_0,fold_1,fold_2,fold_3,fold_4 AG_2
## 4 Adrenal_c3 fold_0,fold_1,fold_2,fold_3,fold_4 AG_3
## 5 Brain_c0 fold_0,fold_1,fold_2,fold_3,fold_4 BR_0
## 6 Brain_c1 fold_0,fold_1,fold_2,fold_3,fold_4 BR_1
Load metrics:
chrombpnet_metrics <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_metrics.tsv")) %>%
dplyr::select(Cluster_ChromBPNet, total_frags) %>%
distinct()## Rows: 1015 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): organ, Cluster_ChromBPNet, L1_annot, Model, Fold
## dbl (13): number_of_cells, total_frags, counts_metrics.peaks.spearmanr, coun...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Load number of peaks:
npeaks <- read_table(here("output/03-chrombpnet/01-models/qc/npeaks.tsv"),
col_names = c("n_peaks", "peak_file")) %>%
filter(peak_file != "total")##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## n_peaks = col_double(),
## peak_file = col_character()
## )
nrow(npeaks) == 203## [1] TRUE
npeaks <- npeaks %>%
rowwise() %>%
mutate(peak_file = basename(peak_file)) %>%
separate(peak_file, into = c("Cluster_ChromBPNet", "drop"), sep = "__") %>%
dplyr::select(-drop)
cluster_meta <- cluster_meta %>%
left_join(npeaks, by = "Cluster_ChromBPNet") %>%
left_join(chrombpnet_metrics, by = "Cluster_ChromBPNet")Load merged modisco reports:
modisco_merged <- read_tsv(here("output/03-chrombpnet/02-compendium/modisco_compiled_anno/modisco_merged_reports.tsv")) %>%
# get short cluster ID
left_join(cluster_meta %>% dplyr::select(Cluster, Cluster_ChromBPNet),
by = c("component_celltype" = "Cluster_ChromBPNet"))## Rows: 6362 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): merged_pattern, component_celltype, pattern_class, pattern, compone...
## dbl (1): n_seqlets
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(modisco_merged)## # A tibble: 6 × 7
## merged_pattern component_celltype pattern_class pattern n_seqlets
## <chr> <chr> <chr> <chr> <dbl>
## 1 neg.Average_12__merged_pat… Spleen_c5 neg_patterns patter… 88
## 2 neg.Average_12__merged_pat… Spleen_c0 neg_patterns patter… 461
## 3 neg.Average_12__merged_pat… Skin_c5 neg_patterns patter… 302
## 4 neg.Average_12__merged_pat… Heart_c13 neg_patterns patter… 128
## 5 neg.Average_12__merged_pat… Skin_c7 neg_patterns patter… 708
## 6 neg.Average_12__merged_pat… Heart_c5 neg_patterns patter… 47
## # ℹ 2 more variables: component_pattern <chr>, Cluster <chr>
Load annotated patterns:
# manually annotated motif patterns
motifs_compiled_all <- read_tsv("../02-compendium/04d-ChromBPNet_de_novo_motifs.tsv") %>%
mutate(motif_name = paste0(idx_uniq, "|", annotation)) %>%
dplyr::relocate(motif_name, .after = pattern) %>%
dplyr::relocate(idx_uniq, .after = idx) %>%
# rename categories
mutate(category = ifelse(annotation == "exclude", "exclude", category),
category = case_match(category,
"composite_homo" ~ "homocomposite",
"composite_hetero" ~ "heterocomposite",
.default = category),
category = factor(category, levels = names(cmap_category)),
pattern_class = dplyr::recode(pattern_class, pos = "pos_patterns", neg = "neg_patterns"))## Rows: 834 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): pattern_class, pattern, component_organs, component_celltypes, que...
## dbl (14): idx, total_n_seqlets, n_component_patterns, n_component_celltypes,...
## lgl (6): modisco_cwm_fwd, modisco_cwm_rev, match0_logo, match0_logo_jolma, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
motifs_compiled <- motifs_compiled_all %>%
# filter out exclude motifs
filter(category != "exclude") %>%
# extract the name of the known motif
separate(match0, into = c("TF0", "motif0"), sep = ": ")
# for cases with multiple patterns merged into one motif, take the one with the most
# number of seqlets.
motifs_compiled_unique <- motifs_compiled %>%
group_by(idx_uniq) %>%
slice_max(n = 1, order_by = tibble(total_n_seqlets, n_component_celltypes),
with_ties = FALSE)
dim(motifs_compiled_unique)## [1] 508 43
# get positive and negative patterns
pos_patterns <- motifs_compiled_unique %>% filter(pattern_class == "pos_patterns") %>% pull(motif_name)
length(pos_patterns)## [1] 493
neg_patterns <- motifs_compiled_unique %>% filter(pattern_class == "neg_patterns") %>% pull(motif_name)
length(neg_patterns)## [1] 15
# initial breakdown of motif categories, prior manual collapsing
table(motifs_compiled_all$category)##
## base base_with_flank homocomposite heterocomposite unresolved
## 147 128 135 263 43
## repeat partial exclude
## 7 4 107
# breakdown after manual collapsing & removing low-quality motifs
table(motifs_compiled_unique$category)##
## base base_with_flank homocomposite heterocomposite unresolved
## 86 51 107 222 38
## repeat partial exclude
## 2 2 0
dim(motifs_compiled_unique)## [1] 508 43
Thus, in total we have:
NOTE: There are 509 unique motifs, and the
labels go from 0 to 508 (which spans 509 values). The “missing label” is
for idx_uniq 466, which corresponded to unresolved motifs
which were filtered out.
Load reconciled hit calls:
finemo_param <- "counts_v0.23_a0.8_all"
# load hit counts per motif per cell type
hits <- map(chrombpnet_models_keep$Cluster,
~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_motif.tsv")),
data.table = FALSE) %>%
dplyr::rename(n = count) %>%
tibble::add_column(Cluster = .x, .before = 1)) %>%
setNames(chrombpnet_models_keep$Cluster)
# sanity check
n_cols <- map_dbl(hits, ncol)
if (any(n_cols < 12)) {
missing_cols <- names(n_cols[n_cols < 12])
stop("@ Missing columns in ", glue_collapse(missing_cols, ", "))
}
saveRDS(hits, file = glue("{out}/hits.Rds"))
# load summary stats on hits per peak per cell type
hits_per_peak <- map_dfr(chrombpnet_models_keep$Cluster,
~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_peak.tsv")),
data.table = FALSE) %>%
tibble::add_column(Cluster = .x, .before = 1))Calculate total number of hits per pattern per cell type:
hits_all <- bind_rows(hits)
if (!all(motifs_compiled_unique$motif_name %in% hits_all$motif_name) | !all(hits_all$motif_name %in% motifs_compiled$motif_name)) {
stop("@ Motif names in annotation don't match motif names in hits.")
}
hits_all_totals <- hits_all %>%
group_by(motif_name) %>%
summarize(total_hits = sum(n))
# organ w/ most hits
hits_top_organ <- hits_all %>%
left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>%
group_by(motif_name, organ) %>%
summarize(hits_per_organ = sum(n)) %>%
group_by(motif_name) %>%
slice_max(n = 1, order_by = hits_per_organ, with_ties = FALSE) %>%
dplyr::select(motif_name, top_organ = organ)## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'motif_name'. You can override using the
## `.groups` argument.
dim(hits_all_totals)## [1] 508 2
dim(hits_top_organ)## [1] 508 2
motifs_compiled_unique <- hits_all_totals %>%
left_join(motifs_compiled_unique, by = "motif_name") %>%
left_join(hits_top_organ, by = "motif_name") %>%
mutate(motif_name_safe = gsub("\\/|\\||\\#", ".", motif_name)) %>%
mutate(modisco_cwm_fwd = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.fwd.png")),
modisco_cwm_rev = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.rev.png")))
# also subset to positive and neg patterns
motifs_compiled_unique_pos <- motifs_compiled_unique %>%
filter(pattern_class == "pos_patterns")
motifs_compiled_unique_neg <- motifs_compiled_unique %>%
filter(pattern_class == "neg_patterns")
dim(motifs_compiled_unique_pos)## [1] 493 46
hits_all_anno <- hits_all %>%
left_join(cluster_meta %>% dplyr::rename(Cluster_short = Cluster), by = c("Cluster" = "Cluster_ChromBPNet")) %>%
left_join(motifs_compiled_unique %>% dplyr::select(motif_name, pattern, annotation_broad, category))## Joining with `by = join_by(motif_name)`
head(hits_all_anno)## Cluster motif_name pattern_class n Distal
## 1 Adrenal_c0 392|NR:ESRRA/NR5A pos_patterns 158975 52205
## 2 Adrenal_c0 456|ZEB/SNAI neg_patterns 122904 31275
## 3 Adrenal_c0 132|BZIP:FOSL/JUND#1 pos_patterns 52506 17353
## 4 Adrenal_c0 436|SP/KLF pos_patterns 45828 6164
## 5 Adrenal_c0 260|GATA#1 pos_patterns 33086 11270
## 6 Adrenal_c0 507|unresolved,repressive,YY1/2-like neg_patterns 32037 8234
## Exonic Intronic Promoter mean_distToTSS median_distToTSS
## 1 9379 84616 12775 20180.92 8859.0
## 2 8002 48500 35127 11567.57 2944.0
## 3 2527 28584 4042 20839.47 9456.0
## 4 2238 9426 28000 3203.65 140.0
## 5 1370 18616 1830 22932.57 10593.5
## 6 2288 12955 8560 12456.09 3584.0
## mean_distToPeakSummit median_distToPeakSummit Cluster_short organ
## 1 129.77 67 AG_0 Adrenal
## 2 171.96 128 AG_0 Adrenal
## 3 114.59 50 AG_0 Adrenal
## 4 88.78 52 AG_0 Adrenal
## 5 128.59 74 AG_0 Adrenal
## 6 150.93 114 AG_0 Adrenal
## organ_code cluster_id compartment L1_annot
## 1 AG 0 epi AG_0_adrenal cortex 1
## 2 AG 0 epi AG_0_adrenal cortex 1
## 3 AG 0 epi AG_0_adrenal cortex 1
## 4 AG 0 epi AG_0_adrenal cortex 1
## 5 AG 0 epi AG_0_adrenal cortex 1
## 6 AG 0 epi AG_0_adrenal cortex 1
## L2_annot L3_annot dend_order ncell median_numi
## 1 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 2 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 3 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 4 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 5 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 6 Adrenal adrenal cortex adrenal cortex 7 823 9498
## median_ngene median_nfrags median_tss median_frip
## 1 3280 6784 9.832 0.2906086
## 2 3280 6784 9.832 0.2906086
## 3 3280 6784 9.832 0.2906086
## 4 3280 6784 9.832 0.2906086
## 5 3280 6784 9.832 0.2906086
## 6 3280 6784 9.832 0.2906086
## note
## 1 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 2 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 3 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 4 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 5 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 6 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## organ_color compartment_color n_peaks total_frags
## 1 #876941 #11A579 121634 7407892
## 2 #876941 #11A579 121634 7407892
## 3 #876941 #11A579 121634 7407892
## 4 #876941 #11A579 121634 7407892
## 5 #876941 #11A579 121634 7407892
## 6 #876941 #11A579 121634 7407892
## pattern annotation_broad category
## 1 pos.Average_256__merged_pattern_0 NR base
## 2 neg.Average_12__merged_pattern_0 ZEB/SNAI base
## 3 pos.Average_287__merged_pattern_0 BZIP base
## 4 pos.Average_212__merged_pattern_0 SP/KLF base
## 5 pos.Average_248__merged_pattern_0 GATA base
## 6 neg.Average_15__merged_pattern_1 unresolved unresolved
# breakdown of motif categories per pos patterns
table(motifs_compiled_unique_pos$category)##
## base base_with_flank homocomposite heterocomposite unresolved
## 81 51 102 221 34
## repeat partial exclude
## 2 2 0
# save as TSV for re-use
motifs_compiled_unique %>% write_tsv(glue("{out}/motifs_compiled_unique.tsv"))
# short version
motifs_compiled_unique %>%
arrange(idx_uniq) %>%
dplyr::select(motif_name, total_hits, idx, idx_uniq, pattern_class, pattern,
annotation, annotation_broad, category,
TF0, motif0, qval0, top_organ, component_organs, component_celltypes) %>%
write_tsv(glue("{out}/motifs_compiled_unique_short.tsv"))
hits_all_anno %>% write_tsv(glue("{out}/hits_per_cluster_annotated.tsv"))Prep the hit counts matrices across cell types:
dim(hits_all)## [1] 33054 12
mat_hits <- hits_all %>%
filter(pattern_class == "pos_patterns") %>%
left_join(motifs_compiled_unique_pos, by = c("motif_name", "pattern_class")) %>%
filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
dplyr::select(Cluster, motif_name, n) %>%
spread(motif_name, n, fill = 0) %>%
tibble::column_to_rownames(var = "Cluster") %T>%
write_tsv(glue("{out}/hit_matrix.pos_patterns_no_unresolved.tsv"))
dim(mat_hits)## [1] 189 455
hits_all_broad <- hits_all %>%
filter(pattern_class == "pos_patterns") %>%
left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
group_by(annotation_broad, Cluster) %>%
summarize(n = sum(n))## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
mat_hits_broad <- hits_all_broad %>%
spread(annotation_broad, n, fill = 0) %>%
tibble::column_to_rownames(var = "Cluster") %T>%
write_tsv(glue("{out}/hit_matrix.broad_pos_patterns_no_unresolved.tsv"))
dim(mat_hits_broad)## [1] 189 169
mat_hits_unresolved <- hits_all %>%
filter(pattern_class == "pos_patterns") %>%
left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
filter(category == "unresolved") %>%
dplyr::select(motif_name, n, Cluster) %>%
spread(motif_name, n, fill = 0) %>%
as.data.frame() %>%
tibble::column_to_rownames(var = "Cluster")
dim(mat_hits_unresolved)## [1] 189 34
Load hit counts for binned distances to nucleosomes:
# load motif hit counts per binned distance to nucleosome dyads
hit_dyad_dist <- map_dfr(chrombpnet_models_keep$Cluster,
~ data.table::fread(
here(glue("output/03-chrombpnet/02-compendium/nucleoatac/{.x}/{.x}.motif_dist_to_dyad.tsv")),
data.table = FALSE) %>%
tibble::add_column(Cluster = .x, .before = 1))
hit_dyad_dist <- left_join(hit_dyad_dist, motifs_compiled_unique, by = "motif_name")Load hit counts for binned distances to peak summits:
# load motif hit counts per binned distance to nucleosome dyads
hit_summit_dist <- map_dfr(chrombpnet_models_keep$Cluster,
~ data.table::fread(
here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/counts_v0.23_a0.8_all/motif_dist_to_summit.tsv")),
data.table = FALSE) %>%
tibble::add_column(Cluster = .x, .before = 1))
hit_summit_dist <- left_join(hit_summit_dist, motifs_compiled_unique, by = "motif_name")Load HOCOMOCO:
hocomoco <- read_tsv(here("output/02-global_analysis/04/hocomoco_unique.tsv"))## Rows: 949 Columns: 662
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (45): name, original_motif.name, original_motif.tf, original_motif.moti...
## dbl (617): original_motif.subtype_info.datatype_counts.P, original_motif.sub...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unique_classes <- unique(hocomoco$masterlist_info.tfclass_class)
cmap_class <- c(ArchR::paletteDiscrete(unique_classes, set = "stallion"), "N/A" = "gray90")## Length of unique values greater than palette, interpolating..
Load CWMs for the compiled motifs:
# use custom helper
cwm_list <- read_cwm_meme(here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/modisco_compiled.memedb.txt")))## Could not find strand info, assuming +.
length(cwm_list)## [1] 834
cwm_list <- cwm_list[base::intersect(names(cwm_list), motifs_compiled_unique$pattern)]
length(cwm_list)## [1] 508
names(cwm_list) <- plyr::mapvalues(names(cwm_list), from = motifs_compiled_unique$pattern, to = motifs_compiled_unique$motif_name)
# label matrix dimensions
cwm_list <- map(cwm_list, function(cwm) {
rownames(cwm) <- c("A", "C", "G", "T")
colnames(cwm) <- seq(1:30)
return(cwm)
})
print(length(cwm_list))## [1] 508
saveRDS(cwm_list, file = glue("{out}/denovo_cwm_list.Rds"))save(chrombpnet_models_keep, cluster_meta, motifs_compiled_unique, motifs_compiled_all,
cwm_list, hit_dyad_dist, hit_summit_dist, hits_all_anno, hits_per_peak,
file = glue("{out}/intermediate_obj.Rda"))To reload:
load(glue("{out}/intermediate_obj.Rda"))
hits <- readRDS(glue("{out}/hits.Rds"))
hits_all <- bind_rows(hits)fs::dir_create(file.path(out, "trimmed_cwm_logos"))
all(fs::file_exists(motifs_compiled_unique$modisco_cwm_fwd))
all(fs::file_exists(motifs_compiled_unique$modisco_cwm_rev))
for (i in 1:nrow(motifs_compiled_unique)) {
new_name_fw <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.fwd.png")
fs::file_copy(motifs_compiled_unique$modisco_cwm_fwd[i], file.path(out, "trimmed_cwm_logos", new_name_fw), overwrite = TRUE)
new_name_rev <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.rev.png")
fs::file_copy(motifs_compiled_unique$modisco_cwm_rev[i], file.path(out, "trimmed_cwm_logos", new_name_rev), overwrite = TRUE)
}motifs_compiled_unique %>%
mutate(
class = case_when(
pattern_class == "pos_patterns" ~ "Activating",
TRUE ~ "Reducing"
),
cwm_logo_fwd = paste0(motif_name_safe, "__cwm.fwd.png"),
cwm_logo_rev = paste0(motif_name_safe, "__cwm.rev.png"),
) %>%
dplyr::select(
idx_uniq,
motif_name,
motif_name_safe,
class,
total_instances_across_celltypes = total_hits,
cwm_logo_fwd,
cwm_logo_rev,
query_consensus,
annotation,
annotation_broad,
category,
best_match_TF = TF0,
best_match_motif = motif0,
best_match_TOMTOM_qval = qval0,
best_match_TFs_in_family = match0_TFs_in_family,
best_match_TFs_in_Vierstra_archetype = match0_TFs_in_archetype,
total_seqlets_across_celltypes = total_n_seqlets,
merged_pattern = pattern,
n_constituent_motifs = n_component_patterns,
n_constituent_celltypes = n_component_celltypes,
constituent_organs = component_organs,
constituent_celltypes = component_celltypes
) %>%
arrange(idx_uniq) %>%
write_csv(glue("{out}/TABLE_motif_compendium.csv"))What kinds of patterns did we detect?
First, let’s look at all 834 motifs from the motif clustering workflow.
p1 <- motifs_compiled_all %>%
group_by(category) %>%
count() %>%
# reverse levels since we're flipping coordinates
mutate(category = factor(category, levels = rev(names(cmap_category)))) %>%
ggplot(aes(x = category, y = n)) +
geom_col(aes(fill = category)) +
geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
coord_flip() +
scale_fill_manual(values = cmap_category) +
ggtitle("De novo deep learning derived motifs \n(after MoDISco agggregation step)") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90")) +
ylim(c(0, 280))
p2 <- motifs_compiled_all %>%
ggplot(aes(x = "breakdown")) +
geom_bar(aes(fill = category), position = "fill") +
scale_fill_manual(values = cmap_category)
plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(0.65, 0.35))Next, let’s look at the unique motifs after QC and collapsing motifs:
# stacked barplot
p1 <- motifs_compiled_unique %>%
ggplot(aes(x = "breakdown")) +
geom_bar(aes(fill = category), position = "fill") +
scale_fill_manual(values = cmap_category) +
xlab(NULL) +
theme(legend.position = "bottom") +
no_legend()
# barplot of counts per category
p2 <- motifs_compiled_unique %>%
group_by(category) %>%
count() %>%
# reverse levels since we're flipping coordinates
mutate(category = factor(category, levels = rev(names(cmap_category)))) %>%
ggplot(aes(x = category, y = n)) +
geom_col(aes(fill = category)) +
geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
coord_flip() +
scale_fill_manual(values = cmap_category) +
ggtitle("De novo deep learning derived motifs \n(after manual collapsing & removing low quality motifs)") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90")) +
ylim(c(0, 280))
plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(1, 4, 3.5))p1 <- motifs_compiled_unique %>%
group_by(pattern_class) %>%
count() %>%
ggplot(aes(x = "1", y = n)) +
geom_col(aes(fill = pattern_class), alpha = 0.4) +
coord_flip() +
scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
ggtitle("De novo deep learning derived motifs") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90"))
p2 <- hits_all %>%
left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>%
group_by(organ, pattern_class) %>%
summarize(n_hits = sum(n)) %>%
mutate(organ = factor(organ, levels = rev(unique(.$organ)))) %>%
ggplot(aes(x = organ, y = n_hits)) +
geom_col(aes(fill = pattern_class), alpha = 0.4, position = "fill") +
coord_flip() +
scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
ggtitle("Breakdown of total hits") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90"))## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'organ'. You can override using the
## `.groups` argument.
plot_grid(p1, p2, ncol = 1, align = "v", axis = "rl", rel_heights = c(1, 4))Distribution of TOMTOM q-values for similarity to known motifs, and total # hits per motif:
p1 <- motifs_compiled_unique %>%
mutate(similarity = qval0) %>%
ggplot(aes(x = category, y = -log10(qval0))) +
geom_boxplot(aes(fill = category), outliers = FALSE) +
geom_jitter(shape = 21, width = 0.2, size = 0.5) +
scale_fill_manual(values = cmap_category) +
geom_hline(yintercept = -log10(0.05)) +
no_legend() +
xlab(NULL) +
rotate_x()
p2 <- motifs_compiled_unique %>%
ggplot(aes(x = category, y = log10(total_hits))) +
geom_boxplot(aes(fill = category), outlier.colour = NA) +
geom_jitter(color = "black", width = 0.2, size = 0.5, shape = 21) +
scale_fill_manual(values = cmap_category) +
rotate_x() +
no_legend() +
# ggtitle("Total # of hits per unique motif \neach point is a motif (n=525)") +
coord_cartesian(ylim = c(1, 7))
plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb")Total number of hits per cell type:
hits_all %>%
group_by(Cluster, pattern_class) %>%
summarize(total_hits = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>%
ggplot(aes(x = Cluster, y = total_hits)) +
facet_wrap(~ pattern_class, ncol = 1) +
geom_bar(stat = "identity", aes(fill = organ)) +
scale_fill_manual(values = cmap_organ) +
rotate_x() +
ylab("Total number of hits") +
theme(
strip.background = element_blank(),
axis.text.x = element_text(size = 5),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(labels = scales::comma) +
ggtitle("Total hits per cell type")## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
By class (positive/negative patterns):
hits_all %>%
left_join(motifs_compiled_unique %>% dplyr::select(-pattern_class),
by = c("motif_name")) %>%
group_by(Cluster, pattern_class) %>%
summarize(n_per_class = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>%
ggplot(aes(x = Cluster, y = n_per_class)) +
geom_bar(stat = "identity", aes(fill = pattern_class), alpha = 0.4, position = "fill") +
scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
rotate_x() +
ylab("Total number of hits") +
theme(
axis.text.x = element_text(size = 5),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(labels = scales::comma) +
ggtitle("Total hits, by pattern class")## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
Let’s plot total hits and mean hits per peak, grouped by organ:
p1 <- hits_all %>%
group_by(Cluster, pattern_class) %>%
summarize(total_hits = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster)),
pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>%
ggplot(aes(x = organ, y = total_hits)) +
geom_jitter(stat = "identity", aes(fill = organ), shape = 21, color = "white", size = 3, alpha = 0.6, width = 0.25) +
scale_fill_manual(values = cmap_organ) +
rotate_x() +
facet_wrap(~ pattern_class, ncol = 1) +
ylab("Total number of hits") +
theme(
strip.background = element_blank(),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(labels = scales::comma) +
ggtitle("Total hits (+ patterns)") +
no_legend()## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
p2 <- hits_per_peak %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>%
gather(pattern_class, median_n_hits, median_pos_patterns, median_neg_patterns) %>%
mutate(pattern_class = factor(pattern_class, levels = c("median_pos_patterns", "median_neg_patterns"))) %>%
ggplot(aes(x = organ, y = median_n_hits)) +
ggbeeswarm::geom_quasirandom(
aes(fill = organ), shape = 21, color = "white", size = 3, alpha = 0.6) + #, width = 0.25, height = 0) +
scale_fill_manual(values = cmap_organ) +
facet_wrap(~ pattern_class, ncol = 1) +
rotate_x() +
ylab("Median number of hits per peak (+ patterns)") +
theme(
strip.background = element_blank(),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(breaks = seq(0, 10, 1), labels = seq(0, 10), limits = c(0, 9)) +
no_legend() +
ggtitle("Median hits per peak (+ patterns)")
plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb")Here we look at how total number of motif instances varies with total fragments and number of peaks:
# calculate total hist per cluster (incl. both pos/neg patterns)
total_hits_per_cluster <- hits_all %>%
group_by(Cluster) %>%
summarize(total_hits = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))
# this is for the 189 models which passed QC
total_hits_per_cluster$Cluster %>% unique %>% length## [1] 189
summary(total_hits_per_cluster$total_hits)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73500 457034 664835 839544 1110354 2593062
cor_frags <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
summary(cor_frags)##
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1266989 -240087 -88317 214846 1610838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.052e+05 3.600e+04 14.03 <2e-16 ***
## total_hits_per_cluster$total_frags 8.947e-03 6.326e-04 14.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 373300 on 187 degrees of freedom
## Multiple R-squared: 0.5169, Adjusted R-squared: 0.5143
## F-statistic: 200 on 1 and 187 DF, p-value: < 2.2e-16
cor_peaks <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
summary(cor_peaks)##
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -555859 -99658 6559 109954 652260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.796e+05 3.305e+04 -5.435 1.69e-07 ***
## total_hits_per_cluster$n_peaks 8.802e+00 2.567e-01 34.294 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 198900 on 187 degrees of freedom
## Multiple R-squared: 0.8628, Adjusted R-squared: 0.8621
## F-statistic: 1176 on 1 and 187 DF, p-value: < 2.2e-16
p1 <- total_hits_per_cluster %>%
ggplot(aes(x = log10(total_frags), y = total_hits)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = label_number(scale = 1/1e6)) +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
x = 7, y = 2e6, size = 5) +
xlab("log10(Total fragments)") + ylab("Total hits (x 1M)") +
square() +
ggtitle("# motif instances vs. \n# fragments") +
no_legend()
p2 <- total_hits_per_cluster %>%
ggplot(aes(x = n_peaks, y = total_hits)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = label_number(scale = 1/1e6)) +
scale_x_continuous(labels = scales::comma) +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
x = 7e4, y = 2e6, size = 5) +
xlab("Total peaks") + ylab("Total hits (x 1M)") +
square() +
ggtitle("# motif instances vs. \n# peaks") +
no_legend()
# median number of instances for positive motifs per peaks
n_hits_per_peaks <- hits_per_peak %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))
cor_frags <- lm(n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
summary(cor_frags)##
## Call:
## lm(formula = n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0016 -0.9635 -0.0603 0.7646 3.3597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.942e+00 9.692e-02 40.67 < 2e-16 ***
## n_hits_per_peaks$total_frags 8.634e-09 1.703e-09 5.07 9.52e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.005 on 187 degrees of freedom
## Multiple R-squared: 0.1209, Adjusted R-squared: 0.1162
## F-statistic: 25.71 on 1 and 187 DF, p-value: 9.516e-07
cor_peaks <- lm(n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
summary(cor_peaks)##
## Call:
## lm(formula = n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.73318 -0.57254 -0.00856 0.49256 2.90880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.991e+00 1.449e-01 20.645 <2e-16 ***
## n_hits_per_peaks$n_peaks 1.100e-05 1.125e-06 9.779 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8717 on 187 degrees of freedom
## Multiple R-squared: 0.3383, Adjusted R-squared: 0.3348
## F-statistic: 95.63 on 1 and 187 DF, p-value: < 2.2e-16
p3 <- n_hits_per_peaks %>%
ggplot(aes(x = log10(total_frags), y = median_pos_patterns)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
no_legend() +
square() +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
x = 7.5, y = 7, size = 5) +
xlab("log10(Total fragments)") + ylab("median # positive hits per peak") +
ggtitle("median positive hits per peak vs. \n# fragments")
p4 <- n_hits_per_peaks %>%
ggplot(aes(x = n_peaks, y = median_pos_patterns)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(labels = scales::comma) +
no_legend() +
square() +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
x = 70000, y = 7, size = 5) +
xlab("total number of peaks") + ylab("median # positive hits per peak") +
ggtitle("median positive hits per peak vs. \n# peaks")
# calculate quartiles
n_hits_per_peaks$n_peak_quartile <- ntile(n_hits_per_peaks$n_peaks, n=4)
table(n_hits_per_peaks$n_peak_quartile)/sum(table(n_hits_per_peaks$n_peak_quartile))##
## 1 2 3 4
## 0.2539683 0.2486772 0.2486772 0.2486772
n_hits_per_peaks$total_frags_quartile <- ntile(n_hits_per_peaks$total_frags, n=4)
table(n_hits_per_peaks$total_frags_quartile)/sum(table(n_hits_per_peaks$total_frags_quartile))##
## 1 2 3 4
## 0.2539683 0.2486772 0.2486772 0.2486772
p5 <- n_hits_per_peaks %>%
ggplot(aes(x = total_frags_quartile, y = median_pos_patterns, group = total_frags_quartile)) +
geom_boxplot(fill = "gray90", outliers = FALSE) +
ggbeeswarm::geom_quasirandom(
aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
scale_fill_manual(values = cmap_organ) +
no_legend() +
ylab("Median # positive hits per peak") +
xlab("Quantile (total number of fragments)") +
ggtitle("median positive hits per peak vs. \n# fragments")
p6 <- n_hits_per_peaks %>%
ggplot(aes(x = n_peak_quartile, y = median_pos_patterns, group = n_peak_quartile)) +
geom_boxplot(fill = "gray90", outliers = FALSE) +
ggbeeswarm::geom_quasirandom(
aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
scale_fill_manual(values = cmap_organ) +
no_legend() +
ylab("Median # positive hits per peak") +
xlab("Quantile (total number of peaks)") +
ggtitle("median positive hits per peak vs. \n# peaks")
plot_grid(p1, p3, p5, p2, p4, p6, align = "hv", axis = "rltb")First we need to aggregate the counts per bin over motif variants and cell types, per category:
# total counts
motifs_keep <- hit_dyad_dist %>%
filter(category %in% c("base", "base_with_flank")) %>%
group_by(annotation_broad) %>%
summarize(sum_all_counts = sum(n)) %>%
arrange(sum_all_counts) %>%
filter(sum_all_counts > 50000) %>%
pull(annotation_broad)
motifs_keep %>% length## [1] 40
medians_dist_dyad <- hit_dyad_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist) %>%
summarize(sum_counts = sum(n)) %>%
# we need to actually calculate the median given the binned counts
slice_max(order_by = sum_counts, n = 1) %>%
arrange(bin_dist)## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
# z-score
hit_dyad_dist_summed <- hit_dyad_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist) %>%
summarize(sum_counts = sum(n)) %>%
# zscore counts per motif (row-wise)
mutate(zscore = scale(sum_counts)) %>%
mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
bin_dist = as.numeric(bin_dist)) %>%
ungroup()## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
p1 <- hit_dyad_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = zscore)) +
scale_fill_gradientn(colours = rdbu2,
# set midpoint to 0
# https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
geom_vline(xintercept = 75, linewidth = 1) +
scale_x_continuous(breaks = seq(10, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from nucleosome dyad (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")We can plot the heatmap w/ the actual counts on top so that we can manually inspect:
hit_dyad_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = zscore)) +
geom_text(aes(label = round(sum_counts, 2)), color = "black") +
scale_fill_gradientn(colours = rdbu2,
# set midpoint to 0
# https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
rescaler = ~ scales::rescale_mid(.x, mid = 0)) +
geom_vline(xintercept = 75, linewidth = 1) +
scale_x_continuous(breaks = seq(10, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from nucleosome dyad (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")hit_summit_dist_summed <- hit_summit_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist) %>%
summarize(sum_counts = sum(n)) %>%
filter(bin_dist <= 250) %>%
# zscore counts per motif (row-wise)
mutate(zscore = scale(sum_counts)) %>%
mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
bin_dist = as.numeric(bin_dist)) %>%
ungroup()## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
hit_summit_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = sum_counts)) +
scale_fill_viridis_c("plasma") +
scale_x_continuous(breaks = seq(0, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from peak summit (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")p2 <- hit_summit_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = zscore)) +
scale_fill_gradientn(colours = rdbu2,
# set midpoint to 0
# https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
scale_x_continuous(breaks = seq(0, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from peak summit (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")hit_summit_dist_summed <- hit_summit_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist, pattern_class) %>%
summarize(sum_counts = sum(n))## `summarise()` has grouped output by 'annotation_broad', 'bin_dist'. You can
## override using the `.groups` argument.
hit_summit_dist_summed %>%
filter(annotation_broad != "unresolved") %>%
group_by(annotation_broad) %>%
mutate(cum_sum_counts = cumsum(sum_counts)) %>%
mutate(pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>%
ggplot(aes(x = bin_dist, y = log10(cum_sum_counts), group = annotation_broad)) +
# geom_point(aes(fill = pattern_class), alpha = 0.4, color = "white", shape = 21) +
geom_line(aes(color = pattern_class), alpha = 0.4) +
facet_wrap(~ pattern_class, ncol = 1) +
scale_color_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
scale_fill_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
theme(panel.grid.minor.x = element_line(color = "gray90"),
panel.grid.major.x = element_line(color = "gray90"),
strip.background.x = element_blank()) +
no_legend()hit_summit_dist_coarse <- hit_summit_dist %>%
dplyr::select(Cluster, motif_name, annotation_broad, category, pattern_class, bin_dist, n_per_cluster_per_bin = n) %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
mutate(bin_dist_coarse = case_when(
bin_dist <= 50 ~ "0-50 bp",
bin_dist <= 100 ~ "51-100 bp",
bin_dist <= 150 ~ "101-150 bp",
bin_dist <= 200 ~ "151-200 bp",
bin_dist <= 250 ~ "201-250 bp",
TRUE ~ "> 251 bp"
)) %>%
group_by(annotation_broad, pattern_class, bin_dist_coarse) %>%
summarize(n_per_bin = sum(n_per_cluster_per_bin)) %>%
mutate(n_total = sum(n_per_bin),
prop_ber_bin = n_per_bin / n_total)## `summarise()` has grouped output by 'annotation_broad', 'pattern_class'. You
## can override using the `.groups` argument.
cmap_bins <- RColorBrewer::brewer.pal(6, "BuGn")
names(cmap_bins) <- rev(c("0-50 bp", "51-100 bp", "101-150 bp", "151-200 bp", "201-250 bp", "> 251 bp"))
motif_order_dist <- hit_summit_dist_coarse %>% filter(bin_dist_coarse == "0-50 bp") %>% arrange(desc(prop_ber_bin)) %>% pull(annotation_broad) %>% unique()
hit_summit_dist_coarse %>%
mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = names(cmap_bins)),
annotation_broad = factor(annotation_broad, levels = motif_order_dist)) %>%
ggplot(aes(x = annotation_broad, y = prop_ber_bin)) +
geom_bar(stat = "identity", aes(fill = bin_dist_coarse)) +
scale_fill_manual(values = cmap_bins) +
rotate_x()Get proportions of motifs per bin:
hit_summit_dist_coarse %>%
mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = rev(names(cmap_bins)))) %>%
group_by(bin_dist_coarse) %>%
summarize(n_per_bin_across_motifs = sum(n_per_bin)) %>%
mutate(n_total = sum(n_per_bin_across_motifs),
prop = n_per_bin_across_motifs / n_total,
cum_n = cumsum(n_per_bin_across_motifs),
cum_prop = cum_n / n_total) %>%
arrange(bin_dist_coarse)## # A tibble: 6 × 6
## bin_dist_coarse n_per_bin_across_motifs n_total prop cum_n cum_prop
## <fct> <int> <int> <dbl> <int> <dbl>
## 1 0-50 bp 43674790 120645979 0.362 43674790 0.362
## 2 51-100 bp 21467522 120645979 0.178 65142312 0.540
## 3 101-150 bp 13494671 120645979 0.112 78636983 0.652
## 4 151-200 bp 9438266 120645979 0.0782 88075249 0.730
## 5 201-250 bp 7483429 120645979 0.0620 95558678 0.792
## 6 > 251 bp 25087301 120645979 0.208 120645979 1
region_type_summed <- hits_all_anno %>%
filter(annotation_broad %in% motifs_keep) %>%
dplyr::select(motif_name, annotation_broad, Distal, Exonic, Intronic, Promoter) %>%
gather(region_type, count, 3:ncol(.)) %>%
group_by(annotation_broad, region_type) %>%
summarize(count_total = sum(count, na.rm = TRUE)) %>%
mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)))## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
p3 <- region_type_summed %>%
ggplot(aes(x = annotation_broad, y = count_total)) +
geom_col(aes(fill = region_type), position = "fill", alpha = 0.7) +
scale_fill_manual(values = cmap_peaktype2) +
coord_flip() +
theme(legend.position = "bottom")
plot_grid(p1, p2 , p3, nrow = 1, align = "h", axis = "tb", rel_widths = c(2, 2, 1))Load known motifs from Vierstra input set:
vierstra_motifs <- read_cwm_meme(file.path(kundaje_dir, "refs/Vierstra_motifs/all.dbs.meme"))We want to describe motifs in a given subset. This plots:
base_broad <- motifs_compiled_unique %>%
filter(category %in% c("base", "base_with_flank"))
length(base_broad$annotation_broad %>% unique())## [1] 51
plot_motif_summary(base_broad,
hits_all_anno,
hit_dyad_dist,
order_by = "distToTSS",
plot_known_motifs = TRUE,
# manually indicate which motifs to reverse complement
# to match known motif orientation
to_revcomp = c("GATA", "CTCF", "RFX", "MEF2", "NFKB/RELA", "NR", "ONECUT",
"ZNF143", "NR:HNF4", "REST", "THRA/B", "HD:SIX/ZNF", "ZEB/SNAI", "YY1/2rep"))## @ plotting 51 motifs
## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
homocomp_broad <- motifs_compiled_unique %>%
filter(pattern_class == "pos_patterns" & category == "homocomposite")
length(unique(homocomp_broad$annotation_broad))## [1] 22
plot_motif_summary(homocomp_broad,
hits_all_anno,
hit_dyad_dist,
plot_known_motifs = TRUE,
rel_widths = c(3.5, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
to_revcomp = c("ETS_ETS", "IRF/STAT_IRF/STAT", "BZIP_BZIP", "HD:PITX/OTX_HD:PITX/OTX"))## @ plotting 22 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
Just showing the top 30 here, because there are many.
heterocomp_broad <- motifs_compiled_unique %>%
filter(category == "heterocomposite" & !grepl("unresolved", annotation))
heterocomp_broad %>%
group_by(annotation_broad) %>%
count() %>%
arrange(desc(n))## # A tibble: 95 × 2
## # Groups: annotation_broad [95]
## annotation_broad n
## <chr> <int>
## 1 BHLH_HD 11
## 2 ETS_SP/KLF 9
## 3 ETS_NR 8
## 4 FOX_NKX 8
## 5 ETS_SOX 7
## 6 BHLH_BZIP 6
## 7 BHLH_MEF2 6
## 8 BHLH_NFI 6
## 9 ETS_NFI 6
## 10 GATA_MEF2 5
## # ℹ 85 more rows
plot_motif_summary(heterocomp_broad,
hits_all_anno,
hit_dyad_dist,
plot_known_motifs = FALSE,
# known_motifs = jolma_motifs,
# known_motif_col = "match0_jolma",
# known_motif_name = "match0_jolma",
rel_widths = c(3, 0.5, 1, 1, 1, 1, 1, 1.5, 1.5, 1),
top_n = 30)## @ plotting 30 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
neg_patterns <- motifs_compiled_unique %>%
filter(pattern_class == "neg_patterns") %>%
filter(!(category == "unresolved"))
# we can make use of the plotting function by setting the broad annotation to
# the granular one
plot_motif_summary(neg_patterns %>% mutate(annotation_broad = motif_name),
hits_all_anno %>% mutate(annotation_broad = motif_name),
hit_dyad_dist %>% mutate(annotation_broad = motif_name),
plot_known_motifs = TRUE,
rel_widths = c(4, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
to_revcomp = c("456|ZEB/SNAI", "455|YY1/2repressive"))## @ plotting 11 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
unresolved <- motifs_compiled_unique %>%
filter(category == "unresolved") %>%
arrange(desc(total_hits))
plot_motif_summary(unresolved %>% mutate(annotation_broad = motif_name),
hits_all_anno %>% mutate(annotation_broad = motif_name),
hit_dyad_dist %>% mutate(annotation_broad = motif_name),
order_by = "distToTSS",
plot_known_motifs = FALSE,
rel_widths = c(3.5, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
top_n = 30)## @ plotting 30 motifs
## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
.libPaths()## [1] "/oak/stanford/groups/wjg/sjessa/projects/HDMA/firstpass/renv/library/R-4.1/x86_64-pc-linux-gnu"
## [2] "/share/software/user/open/R/4.1.2/lib64/R/library"
## [3] "/oak/stanford/groups/akundaje/sjessa/software/R"
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggthemes_4.2.4 RColorBrewer_1.1-3 ggrastr_1.0.1
## [4] magrittr_2.0.3 GenomicFeatures_1.46.5 AnnotationDbi_1.56.2
## [7] Biobase_2.54.0 GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [10] IRanges_2.28.0 S4Vectors_0.32.4 BiocGenerics_0.40.0
## [13] ggseqlogo_0.1 plotly_4.10.4 cowplot_1.1.3
## [16] ggrepel_0.9.3 stringr_1.5.0 purrr_1.0.2
## [19] readr_2.1.4 tidyr_1.3.0 dplyr_1.1.3
## [22] ggplot2_3.5.1 scales_1.3.0 glue_1.8.0
## [25] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.7.2 colorspace_2.1-0
## [3] rjson_0.2.21 rprojroot_2.0.3
## [5] XVector_0.34.0 rstudioapi_0.15.0
## [7] farver_2.1.1 bit64_4.0.5
## [9] fansi_1.0.4 xml2_1.3.5
## [11] cachem_1.0.8 knitr_1.43
## [13] jsonlite_1.8.7 Rsamtools_2.10.0
## [15] dbplyr_2.3.3 png_0.1-8
## [17] compiler_4.1.2 httr_1.4.7
## [19] Matrix_1.6-3 fastmap_1.1.1
## [21] lazyeval_0.2.2 cli_3.6.1
## [23] htmltools_0.5.6 prettyunits_1.1.1
## [25] tools_4.1.2 gtable_0.3.4
## [27] GenomeInfoDbData_1.2.7 rappdirs_0.3.3
## [29] Rcpp_1.0.11 jquerylib_0.1.4
## [31] vctrs_0.6.3 Biostrings_2.62.0
## [33] rtracklayer_1.54.0 xfun_0.40
## [35] lifecycle_1.0.3 restfulr_0.0.15
## [37] gtools_3.9.4 XML_3.99-0.14
## [39] zlibbioc_1.40.0 MASS_7.3-60
## [41] vroom_1.6.3 hms_1.1.3
## [43] MatrixGenerics_1.6.0 parallel_4.1.2
## [45] SummarizedExperiment_1.24.0 yaml_2.3.7
## [47] curl_5.0.2 gridExtra_2.3
## [49] memoise_2.0.1 sass_0.4.7
## [51] biomaRt_2.50.3 stringi_1.7.12
## [53] RSQLite_2.3.1 highr_0.10
## [55] BiocIO_1.4.0 universalmotif_1.12.4
## [57] ArchR_1.0.2 filelock_1.0.2
## [59] BiocParallel_1.28.3 rlang_1.1.1
## [61] pkgconfig_2.0.3 bitops_1.0-7
## [63] matrixStats_1.0.0 evaluate_0.21
## [65] lattice_0.20-45 GenomicAlignments_1.30.0
## [67] htmlwidgets_1.6.2 labeling_0.4.3
## [69] bit_4.0.5 tidyselect_1.2.0
## [71] plyr_1.8.8 R6_2.5.1
## [73] generics_0.1.3 DelayedArray_0.20.0
## [75] DBI_1.1.3 pillar_1.9.0
## [77] withr_2.5.0 KEGGREST_1.34.0
## [79] RCurl_1.98-1.12 tibble_3.2.1
## [81] crayon_1.5.2 utf8_1.2.3
## [83] BiocFileCache_2.2.1 tzdb_0.4.0
## [85] rmarkdown_2.24 viridis_0.6.4
## [87] progress_1.2.2 data.table_1.14.8
## [89] blob_1.2.4 digest_0.6.33
## [91] munsell_0.5.0 beeswarm_0.4.0
## [93] viridisLite_0.4.2 vipor_0.4.5
## [95] bslib_0.5.1